// Start from an empty session, then pick the platform-specific path pieces.
clear all
global system "linux"

// ${code} is the code directory and ${s} the path separator used to build file names.
if ("${system}" == "linux") {
	global code "/"
	global s "/"
}

// Quietly execute the path-setup do-file.
// NOTE(review): presumably this is where ${harp}, ${Fannie}, ${Freddie}, ${cached}
// and ${temp} are defined; confirm against _set-path.do.
run "${code}${s}_set-path.do"

*** Import GSE loans that preceded HARP (previous loans)
// Static file: convert the origination CSV to .dta once, then delete the CSV.
// The conversion is skipped when the CSV is no longer there (already converted),
// but any error raised during the conversion itself now stops the script
// instead of being silently swallowed by a blanket -capture-.
capture confirm file "${harp}/gse_id-harp-prev-loan.csv"
if _rc == 0 {
	import delimited "${harp}/gse_id-harp-prev-loan.csv", clear // GSE loan-level origination file (non-harp)
	
	drop prod_type_orig // all value = FRM
	
	// cleaning date variables: parse with a year-month mask and store as
	// monthly dates under the original variable names
	foreach x of varlist loan_close_date zero_balance_date {
		tostring `x', replace
		gen temp = mofd(date(`x', "YM"))
		drop `x'
		rename temp `x'
		format `x' %tm
	}
	save "${harp}/gse_id-harp-prev-loan.dta", replace
	erase "${harp}/gse_id-harp-prev-loan.csv"
}
else {
	display as text "note: gse_id-harp-prev-loan.csv not found; assuming it was already converted to .dta"
}

// Dynamic file: convert the performance CSV to .dta once, delete the CSV, and
// derive a one-record-per-loan extract from it.
// The block is skipped when the CSV is no longer there (already converted),
// but any error raised during the conversion itself now stops the script
// instead of being silently swallowed by a blanket -capture-.
capture confirm file "${harp}/gse_id-harp-prev-loan-dynamic.csv"
if _rc == 0 {

	import delimited "${harp}/gse_id-harp-prev-loan-dynamic.csv", clear // GSE loan-level performance file (non-harp)

	// parse year_month_id with a year-month mask into a monthly date called ym
	tostring year_month_id, replace
	gen temp = mofd(date(year_month_id, "YM"))
	drop year_month_id
	rename temp ym
	format ym %tm
	sort mcdash_id ym
	save "${harp}/gse_id-harp-prev-loan-dynamic.dta", replace
	erase "${harp}/gse_id-harp-prev-loan-dynamic.csv"
	
	// The data in memory are exactly the file just saved and are already sorted
	// by mcdash_id ym, so no reload or re-sort is needed.
	// Keep the second-to-last monthly record of each loan; loans with a single
	// record are dropped because _n == _N-1 can never hold for them.
	// NOTE(review): the output is named "lastperiod" but the penultimate record
	// is kept -- presumably deliberate; confirm.
	by mcdash_id: keep if _n == _N-1
	*by mcdash_id: keep if _n == _N-2
	save "${harp}/gse_id-harp-prev-loan-lastperiod.dta", replace
}
else {
	display as text "note: gse_id-harp-prev-loan-dynamic.csv not found; assuming it was already converted to .dta"
}



*** Harp Performance data
// Stack the Fannie and Freddie HARP performance files into one panel with a
// common monthly date (ym) and an agency flag (1 = Fannie, 2 = Freddie).
// zero_balance_date is no longer converted to a monthly date: that variable was
// dropped again at the end of this section without ever being used.
use id_loan_harp reporting_prd current_upb loan_age mths* zero_bal* using "${Fannie}/harp_historical.dta", clear // HARP performance file
tostring id_loan_harp, replace format(%14.0g)

// in the Fannie file reporting_prd is a string parsed with a month-day-year mask
gen ym = mofd(date(reporting_prd, "MDY"))
format ym %tm
// drop the Fannie date variables so the Freddie ones can be appended under the same names
drop reporting_prd zero_balance_date

gen agency_id = 1

append using "${Freddie}/harp_historical.dta" // HARP performance file
replace agency_id = 2 if agency_id==. // appended rows have no agency yet
keep id_loan_harp reporting_prd current_upb loan_age mths* zero_bal* *ym* agency_id

// in the Freddie file reporting_prd is numeric YYYYMM: split into year and month
replace ym = ym(floor(reporting_prd/100), mod(reporting_prd, 100)) if agency_id==2
drop reporting_prd zero_balance_date

drop mths_*adj

// Collapse the performance panel to one row per HARP loan: balances at yearly
// loan ages plus loan-level summaries.
local checkpoints 12, 24, 36, 48, 60, 72, 84, 96

// keep the yearly-age records and, for now, each loan's final record
bysort id_loan_harp (loan_age): gen last = (_n == _N)
keep if last | inlist(loan_age, `checkpoints')

// loan-level summaries, computed while the final record is still present
by id_loan_harp: egen max_ym = max(ym)
format max_ym %tm
bysort id_loan_harp (loan_age): gen last_age = loan_age[_N]
by id_loan_harp: egen loan_zb = max(zero_balance_id)
drop zero_balance_id
rename loan_zb zero_balance_id
gen prepaid = !missing(zero_balance_id) // =1 if prepaid in any way (refi, foreclosure, etc.)
// NOTE(review): zero_balance_id == 1 is treated as the non-foreclosure exit -- confirm the coding
gen forcl = prepaid == 1 & zero_balance_id != 1

// reshape the data
// only the yearly-age records are kept, so loans with no record at any of
// these ages drop out of the file entirely
keep if inlist(loan_age, `checkpoints')
keep id_loan_harp current_upb agency_id last_age zero_balance_id loan_age prepaid forcl max_ym
reshape wide current_upb, i(id_loan_harp) j(loan_age)

save "${cached}/harp-performance-combined.dta", replace


*** Combine harp data
// Stack the Fannie and Freddie HARP origination files (agency_id: 1 = Fannie, 2 = Freddie)
use "${Fannie}/harp_static.dta", clear // HARP origination file
drop debt_to_income // this is missing for all observations
rename prop_typ prop_type_id

// convert every loan-id variable to a string before appending
foreach idvar of varlist id_loan* {
	format `idvar' %14.0g
	tostring `idvar', replace format(%14.0g)
}

gen agency_id = 1
append using "${Freddie}/harp_static.dta"  // HARP origination file
replace agency_id = 2 if agency_id == . // appended rows have no agency yet
drop servicer_name

*** Sample Selection
// single-unit properties with a 360-month original term
keep if num_units == 1 & orig_term == 360

*** fix the orig date
// orig_date is only populated for Fannie
gen orig_ym = mofd(date(orig_date, "MY"))
format orig_ym %tm

// for the remaining rows (Freddie) the origination month is taken as the
// first-payment month minus 2
tostring dt_first_payment, replace
replace orig_ym = mofd(date(dt_first_payment, "YM")) - 2 if orig_ym == .

drop orig_date dt_first_payment first_date

*** fix differences in variable names
// loan amount: fall back to orig_upb (Freddie)
replace orig_amount = orig_upb if orig_amount == .
drop orig_upb

// LTV: fill oltv / ocltv from ltv / cltv where missing
foreach ratio of varlist ltv cltv {
	replace o`ratio' = `ratio' if o`ratio' == .
	drop `ratio'
}

// credit score: fill credit_scorecore_b from credit_score where missing
destring credit_score, replace
replace credit_scorecore_b = credit_score if credit_scorecore_b == .

// variables not carried forward
drop credit_score credit_scorecore_c
drop dt_matr cd_msa *fthb* purpose debt_to_income embs_product_type

**** Merge info about previous loans
rename id_loan mcdash_id // this is loan id for the previous loan

// mcdash_id must be unique for the 1:1 merges below: drop every row whose
// previous-loan id appears more than once
sort mcdash_id id_loan_harp
by mcdash_id: gen temp = _N
drop if temp > 1 // dropping only 4 observations
drop temp

// keep only HARP loans whose previous loan is found in the performance extract
// (keep(match) replaces the former keep(master matched) + keep if _merge == 3)
merge 1:1 mcdash_id using "${harp}/gse_id-harp-prev-loan-lastperiod.dta", keep(match) nogenerate

// keep loans originated 0 to 2 months after ym; drop unrealistic values (only small number)
// NOTE(review): ym presumably comes from the using file, i.e. the month of the
// previous loan's retained performance record -- confirm.
keep if inrange(orig_ym - ym, 0, 2)

*** Additional variables from previous loans
// add origination characteristics of the previous loan; unmatched HARP loans are kept
merge 1:1 mcdash_id using "${harp}/gse_id-harp-prev-loan.dta", keep(master matched) nogen keepusing(debt_to_income_ratio_orig orig_credit_score  loan_to_value_orig orig_loan_amt orig_int_rate loan_close_date)
rename loan_close_date loan_close_date_orig

// Trim the tails: set values at or beyond the 1st / 99th percentile to missing.
// r(p1) and r(p99) are used directly rather than through macro expansion, so
// the cutoffs are compared at full double precision and the loop does not
// break with a syntax error when a variable has no non-missing observations.
foreach x of varlist debt_to_income_ratio_orig orig_credit_score  loan_to_value_orig orig_loan_amt orig_int_rate {
	quietly summarize `x', detail
	replace `x' = . if `x' <= r(p1) | `x' >= r(p99)
}

save "${temp}/gse_id-harp-temp.dta", replace
